The summary statistics and respective plots along with eSet creation is in 00_pml_summstats_eset.{Rmd, html}, imputation with random forest is in 00_imp_summary.{Rmd, html}, This file contains filtering of samples along with dimensionality reduction using pca, tsne and umap.
eSet <- readRDS(file.path(PATH, "data/2021_08_20_eset_imputed.RDS"))
table(eSet$Class)
##
## Cancer Control Dysplasia HkNR Inflammatory
## 9 18 22 17 11
eSet_wo_infl <- eSet[, which(pData(eSet)$Class != 'Inflammatory')]
cpm_eset <- eSet_wo_infl
exprs(cpm_eset) <- apply(exprs(cpm_eset), 2, function(x) {x/(sum(x)/1000000)})
print(dim(cpm_eset))
## Features Samples
## 21510 66
cpm_eset$Class <- recode(cpm_eset$Class, "Control"="1-Control", "HkNR"="2-HkNR", "Dysplasia"="3-Dysplasia", "Cancer"="4-OSCC")
cpm_eset$Class <- factor(cpm_eset$Class, levels = c("1-Control", "2-HkNR", "3-Dysplasia", "4-OSCC"))
set.seed(1234)
#exprs(cpm_eset)
exprsMat <- log2(exprs(cpm_eset)+1)
rv <- matrixStats::rowVars(exprsMat)
featureSet <- order(rv, decreasing = TRUE)
exprsToPlot <- exprsMat[featureSet, , drop = FALSE]
exprsToPlot <- scale(t(exprsToPlot))
keepFeature <- (matrixStats::colVars(exprsToPlot) > 0.001)
keepFeature[is.na(keepFeature)] <- FALSE
exprsToPlot <- exprsToPlot[, keepFeature]
pca <- prcomp(exprsToPlot)
percentVar <- pca$sdev ^ 2 / sum(pca$sdev ^ 2)
pcaVariance <- data.frame(percentVar)
rownames(pcaVariance) <- colnames(pca$x)
dtp <- data.frame(pca$x)
dtp$Sample <- colnames(cpm_eset)
dtp$Class <- pData(cpm_eset)$Class
dtp$Type <- pData(cpm_eset)$Type
pcXlab <- paste0('PC1', " ", toString(round(pcaVariance['PC1', ] * 100, 2)), "%","(Exp. Var)")
pcYlab <- paste0('PC2', " ", toString(round(pcaVariance['PC2', ] * 100, 2)), "%","(Exp. Var)")
plotly::ggplotly(ggplot(data = dtp, aes_string(x=dtp$PC1, y=dtp$PC2, label= "Sample", col = "Class", shape="Type"))+ ggplot2::geom_point() + ggplot2::labs(x = pcXlab, y = pcYlab))
set.seed(1234)
tsneOut <- Rtsne::Rtsne(exprsToPlot, perplexity = 5, initial_dims = max(50, ncol(cpm_eset)), max_iter = 200, pca=TRUE, pca_scale=TRUE )
tsneOut <- tsneOut$Y[, c(1, 2)]
dtp <- data.frame(tsneOut)
dtp$Sample <- colnames(cpm_eset)
dtp$Class <- pData(cpm_eset)$Class
dtp$Type <- pData(cpm_eset)$Type
plotly::ggplotly(ggplot(data = dtp, aes_string(x=dtp$X1, y=dtp$X2, label= "Sample", col = "Class", shape="Type"))+ ggplot2::geom_point()+ ggplot2::labs(x = 'tSNE1', y = 'tSNE2'))
ggstyle <- function(font="Helvetica", scale=1) {
fs <- function(x) x*scale # Dynamic font scaling
ggplot2::theme(
plot.title = ggplot2::element_text(family=font, size=fs(26), face="bold", color="#222222"),
plot.subtitle = ggplot2::element_text(family=font, size=fs(18), margin=ggplot2::margin(0,0,5,0)),
plot.caption = ggplot2::element_blank(),
legend.position = "right",
legend.text.align = 0,
legend.background = ggplot2::element_blank(),
legend.title = ggplot2::element_blank(),
legend.key = ggplot2::element_text(family=font, size=fs(20), face = "bold", color="#222222"),
legend.text = ggplot2::element_text(family=font, size=fs(20), face = "bold", color="#222222"),
axis.title = ggplot2::element_text(family=font, size=fs(18), face = "bold", color="#222222"),
axis.text = ggplot2::element_text(family=font, size=fs(18), face = "bold", color="#222222"),
axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
#axis.ticks = ggplot2::element_blank(),
axis.line = ggplot2::element_line(color="#222222"),
panel.grid.minor = ggplot2::element_line(color="#222222"),
panel.grid.major.y = ggplot2::element_line(color="#222222"),
panel.grid.major.x = ggplot2::element_line(color="#222222"),
panel.background = ggplot2::element_rect(fill="grey90"),
strip.background = ggplot2::element_rect(fill="grey90", colour = "black"),
strip.text = ggplot2::element_text(size=fs(22), hjust=0)
)
}
set.seed(1357)
var_genes <- apply(exprs(eSet_wo_infl), 1, var)
top_2000 <- names(var_genes[order(var_genes,decreasing = TRUE)][1:500])
exprsMat <- exprs(eSet_wo_infl[top_2000])
rv <- matrixStats::rowVars(exprsMat)
featureSet <- order(rv, decreasing = TRUE)
exprsToPlot <- exprsMat[featureSet, , drop = FALSE]
exprsToPlot <- scale(t(exprsToPlot))
keepFeature <- (matrixStats::colVars(exprsToPlot) > 0.001)
keepFeature[is.na(keepFeature)] <- FALSE
exprsToPlot <- exprsToPlot[, keepFeature]
#run PCA
pca <- prcomp(exprsToPlot)
percentVar <- pca$sdev ^ 2 / sum(pca$sdev ^ 2)
pcaVariance <- data.frame(percentVar)
rownames(pcaVariance) <- colnames(pca$x)
dtp <- data.frame(pca$x)
dtp$Sample <- colnames(cpm_eset)
dtp$Class <- pData(cpm_eset)$Class
dtp$Type <- pData(cpm_eset)$Type
custom.config <- umap::umap.defaults
custom.config$n_neighbors <- 5
custom.config$alpha <-0.1
custom.config$n_epochs <- 10000
custom.config$metric <- 'euclidean'
custom.config$init <- 'spectral'
umap_results <- umap::umap(dtp[,2:10], config = custom.config)
dtp <- data.frame(umap_results$layout)
dtp$Sample <- colnames(cpm_eset)
dtp$Class <- pData(cpm_eset)$Class
dtp$smoke <- as.character(pData(cpm_eset)$Smoking_status)
dtp$age <- pData(cpm_eset)$Age
dtp$sex <- pData(cpm_eset)$Sex
dtp$prog <- pData(cpm_eset)$Progression_status
p <- ggplot(data = dtp,
aes(x=X1,
y=X2,
label= Sample,
group=Class
)) +
ggplot2::geom_point(size = 4, aes(shape= Class, color=Class, fill=Class)) +
scale_shape_manual(values=c(21, 22, 23, 24))+
scale_color_npg(alpha = 0.7)+
scale_fill_npg(alpha = 0.7)+
ggplot2::labs(x = 'UMAP1', y = 'UMAP2') +
theme_bw() +
ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
axis.text.x = element_text(margin=ggplot2::margin(5, b=10)),
axis.text = element_text(size = 10, family='Helvetica', color="black"),
axis.title = element_text(size = 12, family='Helvetica', color="black"),
legend.text = ggplot2::element_text(family='Helvetica', size=(10), color="black")
)
p
#latest w/ progression status
p <- dtp %>% ggplot(aes(
x=X1,
y=X2,
label= Sample,
group=Class,
shape= Class,
color=Class
)) +
ggplot2::geom_point(size = 4, aes(shape= Class, color=Class, fill=Class)) +
scale_shape_manual(values=c(21, 22, 23, 24))+
scale_color_npg(alpha = 0.7)+
scale_fill_npg(alpha = 0.7)+
geom_point(data = dtp %>% filter((prog=="Stable")),
pch=21,
size=7, fill="grey",
alpha = 0.5,
colour="black") +
geom_point(data = dtp %>% filter((prog=="Progressed-SCC")),
pch=22,
size=7, fill="orange",
alpha = 0.5,
colour="red") +
geom_point(data = dtp %>% filter((prog=="Progressed-Dys")),
pch=22,
size=7,fill="lightblue",
alpha = 0.5,
colour="blue") +
ggplot2::labs(x = 'UMAP1', y = 'UMAP2') +
theme_bw() +
ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
axis.title = element_blank(),
legend.text = ggplot2::element_text(family='Helvetica', size=(10), color="black"), legend.position = "bottom")
p
set.seed(1234)
koutput <- kmeans(umap_results$layout, 3)
cpm_eset$cluster <- koutput$cluster
#match cluster #s to already implemented in umap
cpm_eset$cluster <- recode(cpm_eset$cluster, "3"="1", "2"="2", "1"="3")
dtp$cluster <- pData(cpm_eset)$cluster
umap_clust <- ggplot(data = dtp, aes(x=X1, y=X2, label= Sample, col = cluster, shape=Class))+
ggplot2::geom_point(size=4, aes(shape= Class, color=cluster, fill=cluster)) +
scale_shape_manual(values=c(21, 22, 23, 24))+
scale_color_manual(aesthetics = c("color", "fill"), values = c("3"="#41b6c4", "2"="#a1dab4", "1"="#fecc5c"))+
ggplot2::labs(x = 'UMAP1', y = 'UMAP2') +
theme_bw()+
ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
axis.title = element_blank())
umap_clust
##mlcust
library(mclust)
## Package 'mclust' version 5.4.9
## Type 'citation("mclust")' for citing this R package in publications.
BIC <- mclustBIC(exprsToPlot)
mod1 <- Mclust(exprsToPlot, x = BIC)
table(mod1$classification)
##
## 1 2 3
## 39 22 5
dtp$cluster <- mod1$classification[match(rownames(dtp),names(mod1$classification))]
ggplot(data = dtp, aes_string(x=dtp$X1, y=dtp$X2, label= "Sample", col = factor(dtp$cluster), shape='Class'))+ scale_colour_manual(values=c("red", "blue", "black")) +ggplot2::geom_point(size=3)+ ggplot2::labs(x = 'UMAP1', y = 'UMAP2')
#using mclust
clust_df <- data.frame(cluster=mod1$classification, class=cpm_eset$Class)
tab_df <- table(clust_df)
chiqs <- chisq.test(tab_df)
## Warning in chisq.test(tab_df): Chi-squared approximation may be incorrect
chiqs$residuals
## class
## cluster 1-Control 2-HkNR 3-Dysplasia 4-OSCC
## 1 -1.1149893 -0.3298529 0.5547002 1.1629144
## 2 1.6329932 0.5601120 -1.2309149 -1.1547005
## 3 -0.3113996 -0.2536718 1.0327956 -0.8257228
#using kmeans
kmeans_df <- data.frame(cluster=koutput$cluster, class=cpm_eset$Class)
tab_df <- table(kmeans_df)
chiqs <- chisq.test(tab_df)
## Warning in chisq.test(tab_df): Chi-squared approximation may be incorrect
chiqs$residuals
## class
## cluster 1-Control 2-HkNR 3-Dysplasia 4-OSCC
## 1 -2.1675406 -0.8775269 1.0606602 2.6130983
## 2 -0.1946247 2.1361835 -0.6454972 -1.6514456
## 3 2.4494897 -1.1202241 -0.4923660 -1.1547005
library(Biobase)
library(cba)
library(ComplexHeatmap)
library(scales)
library(circlize)
pml <- cpm_eset
exprs(pml) <- log2(exprs(pml)+1)
pml2K <- BS831::variationFilter(pml,ngenes=2000,score="mad",do.plot=TRUE)
## Warning: replacing previous import 'BiocManager::install' by 'devtools::install'
## when loading 'BS831'
## Warning: replacing previous import 'Biobase::combine' by 'gridExtra::combine'
## when loading 'BS831'
## Warning: replacing previous import 'DESeq2::plotMA' by 'limma::plotMA' when
## loading 'BS831'
## Warning: replacing previous import 'data.table::melt' by 'reshape2::melt' when
## loading 'BS831'
## Warning: replacing previous import 'data.table::dcast' by 'reshape2::dcast' when
## loading 'BS831'
## Warning: replacing previous import 'gplots::lowess' by 'stats::lowess' when
## loading 'BS831'
## Variation filtering based on mad .. done.
## Selecting top 2000 by mad .. done, 2000 genes selected.
## Creating scatter plot ..
## done.
annot <- pData(pml) %>%
dplyr::select(imputed_smoking_label,Sex,Progression_status,Class) %>%
dplyr::rename(Smoker=imputed_smoking_label,Progression=Progression_status) %>%
dplyr::mutate(Class = factor(Class, levels=c("1-Control","2-HkNR","3-Dysplasia","4-OSCC"))) %>%
dplyr::relocate(Class,Progression,Smoker,Sex)
hm_annot <- ComplexHeatmap::HeatmapAnnotation(
Smoker = annot$Smoker,
Sex = annot$Sex,
Progression = annot$Progression,
Class = annot$Class,
col = list(Class = c("1-Control" = "#E64B35B2",
"2-HkNR" = "#4DBBD5B2",
"3-Dysplasia" = "#00A087B2",
"4-OSCC" = "#3C5488B2"),
Progression = c("NA"="white",
"Stable" = "gray",
"Progressed-SCC" = "red",
"Progressed-Dys" = "pink"),
Smoker = c("No"="gray", "Yes"="black"),
Sex = c("F"="pink", "M"="lightblue")), na_col = "white")
pml2K_scaled <- BS831::scale_row(pml2K)
mad_genes <- rownames(exprs(pml2K_scaled))
HALLMARK <- msigdb_gsets("Homo sapiens", "H", "")
names(HALLMARK$genesets) <- names(HALLMARK$genesets) %>% strsplit( "HALLMARK_" ) %>% sapply( tail, 1 )
krt_genes <- mad_genes[grepl(pattern = "KRT", x = mad_genes)]
epi_genes <- unique(c(HALLMARK$genesets$EPITHELIAL_MESENCHYMAL_TRANSITION[HALLMARK$genesets$EPITHELIAL_MESENCHYMAL_TRANSITION %in% rownames(exprs(pml2K_scaled))], krt_genes))
epi_index <- match(epi_genes, rownames(exprs(pml2K_scaled)))
infl_genes <- unique(c(HALLMARK$genesets$INTERFERON_GAMMA_RESPONSE[HALLMARK$genesets$INTERFERON_GAMMA_RESPONSE %in% rownames(exprs(pml2K_scaled))], HALLMARK$genesets$INFLAMMATORY_RESPONSE[HALLMARK$genesets$INFLAMMATORY_RESPONSE %in% rownames(exprs(pml2K_scaled))]))
infl_index <- match(infl_genes, rownames(exprs(pml2K_scaled)))
dend = as.dendrogram(hclust(dist(exprs(pml2K_scaled)), method = 'ward.D'))
od = order.dendrogram(dend)
selected1 = od[which(labels(dend) %in% epi_genes)]
selected2 = od[which(labels(dend) %in% infl_genes)]
ComplexHeatmap::Heatmap(Biobase::exprs(pml2K_scaled),
name="expression",
top_annotation=hm_annot,
cluster_rows=dend,
row_dend_reorder = F,
cluster_columns=TRUE,
clustering_distance_columns="euclidean",
clustering_method_columns="ward.D",
column_split=3,
show_parent_dend_line=TRUE,
row_title="",
show_column_names=FALSE,
show_row_names=FALSE) +
rowAnnotation(link1 = anno_mark(at = selected1,
link_width = unit(0, "mm"),
link_gp = gpar(lwd=0.1),
padding = unit(0, "mm"),
labels = rownames(exprs(pml2K_scaled))[selected1], side = "right",
labels_gp = gpar(fontsize = 2,
fontface = "bold",
col = "red"
)))+
rowAnnotation(link2 = anno_mark(at = selected2,
link_width = unit(0, "mm"),
link_gp = gpar(lwd=0.1),
padding = unit(0, "mm"),
labels = rownames(exprs(pml2K_scaled))[selected2], side = "right",
labels_gp = gpar(fontsize = 2,
fontface = "bold",
col = "darkgreen"
)))